# --- Environment setup: imports, plotting config, and reproducibility seed ---
import numpy as np
import pandas as pd
import seaborn as sns
pd.options.plotting.backend = "matplotlib"
pd.set_option('max_columns',100)
import time,os,json
# Wall-clock start time for the whole notebook run.
time_start_notebook = time.time()
home = os.path.expanduser('~')
# Fixed seed so train/test splits and CV folds are reproducible.
SEED=100
import matplotlib.pyplot as plt
plt.style.use('ggplot')
# NOTE: IPython magics — this file is a flattened Jupyter notebook and will
# not run as a plain .py script without removing/translating the magics.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import sklearn
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize
import joblib
import lightgbm as lgb
# Record library versions (rendered as the cell's output in Jupyter).
[(x.__name__,x.__version__) for x in [np,pd,sns,sklearn,lgb]]
def print_scores(ytest, ypreds, labels=(0, 1, 2, 3), names=tuple('ABCD')):
    """Print macro-averaged classification metrics and show a styled confusion matrix.

    Parameters
    ----------
    ytest : array-like
        True integer class labels.
    ypreds : array-like
        Predicted integer class labels.
    labels : sequence of int, default (0, 1, 2, 3)
        Full set of class labels, used to binarize for the multiclass AUC.
    names : sequence of str, default ('A', 'B', 'C', 'D')
        Display names for the confusion-matrix rows/columns.

    Returns
    -------
    None
        Prints metrics and displays the confusion matrix (Jupyter `display`).
    """
    # roc_auc_score needs one-hot (binarized) labels for the multiclass case.
    ytest_bin = label_binarize(ytest, classes=list(labels))
    ypreds_bin = label_binarize(ypreds, classes=list(labels))
    a = roc_auc_score(ytest_bin, ypreds_bin,
                      average='macro', multi_class='ovo')
    # Macro averaging weights every class equally.
    p = precision_score(ytest, ypreds, average='macro')
    r = recall_score(ytest, ypreds, average='macro')
    f = f1_score(ytest, ypreds, average='macro')
    print(f'Precision: {p: .2f}')
    print(f'Recall : {r: .2f}')
    print(f'F1-score : {f: .2f}')
    # BUG FIX: this line previously printed the F1 value (f) instead of the AUC (a).
    print(f'AUC : {a: .2f}')
    c = classification_report(ytest, ypreds)
    print(c)
    cm = confusion_matrix(ytest, ypreds)
    df_cm = pd.DataFrame(cm, index=list(names), columns=list(names))
    df_cm = df_cm.style.background_gradient()
    # display() is provided by the IPython/Jupyter runtime.
    display(df_cm)
# Running results table: one row is appended per evaluated model configuration.
_eval_columns = ['Model', 'Description', 'Accuracy',
                 'Precision', 'Recall', 'F1', 'AUC']
df_eval = pd.DataFrame({col: [] for col in _eval_columns})
# Raw competition data (for reference) and the cleaned/engineered data used below.
df_raw = pd.read_csv('../data/raw/train.csv')
print(df_raw.shape)
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack head and tail for a quick peek.
pd.concat([df_raw.head(2), df_raw.tail(2)])
df = pd.read_csv('../data/processed/clean_data.csv')
print(df.shape)
pd.concat([df.head(2), df.tail(2)])
# NOTE: the duplicate `from sklearn.model_selection import train_test_split`
# that was here has been removed — it is already imported at the top of the file.
target = 'Segmentation'

# First split: 80% train+valid, 20% held-out test, stratified on the target
# so every segment keeps its original class proportions.
df_Xtrain_orig, df_Xtest, ser_ytrain_orig, ser_ytest = train_test_split(
    df.drop(target, axis=1), df[target], shuffle=True,
    test_size=0.2, random_state=SEED, stratify=df[target])

# Second split: carve a validation set (20% of the training portion).
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
    df_Xtrain_orig, ser_ytrain_orig,
    test_size=0.2, random_state=SEED, stratify=ser_ytrain_orig)

# Plain 1-D numpy label arrays for the sklearn/lightgbm APIs.
ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()
# Report the shape of every split so the partition sizes can be sanity-checked.
for _line in (
    f"df : {df.shape}",
    f"\ndf_Xtrain_orig : {df_Xtrain_orig.shape}",
    f"ser_ytrain_orig: {ser_ytrain_orig.shape}",
    f"\ndf_Xtrain : {df_Xtrain.shape}",
    f"ser_ytrain : {ser_ytrain.shape}",
    f"\ndf_Xvalid : {df_Xvalid.shape}",
    f"ser_yvalid : {ser_yvalid.shape}",
    f"\ndf_Xtest : {df_Xtest.shape}",
    f"ser_ytest : {ser_ytest.shape}",
):
    print(_line)
df_Xtrain_orig.head(2)
# Build candidate feature lists from the training columns.
cols_all = df_Xtrain_orig.columns.to_list()

# Drop the missing-value indicator columns (suffix "_NA").
cols_no_na = [col for col in cols_all if not col.endswith('_NA')]

# Hand-crafted feature-cross columns.
cross = ['gen_mar', 'gen_grad', 'gen_spend',
         'grad_spend', 'grad_spend_gen']
cols_no_na_no_cross = [col for col in cols_no_na if col not in cross]
cols_no_na_no_cross

# Select the working feature set and materialise numpy label vectors.
features = cols_no_na
Xtr = df_Xtrain_orig[features]
Xtx = df_Xtest[features]
Xvd = df_Xvalid[features]
ytr = ser_ytrain_orig.to_numpy().ravel()
ytx = ser_ytest.to_numpy().ravel()
yvd = ser_yvalid.to_numpy().ravel()
# Native LightGBM Dataset objects (used only by the optuna grid search;
# the sklearn wrapper below predicts straight from raw DataFrames, so no
# dtest is needed).
dtrain = lgb.Dataset(Xtr, ytr)
dvalid = lgb.Dataset(Xvd, yvd, reference=dtrain)

# Switch to the full feature set (including the _NA indicators) and rebuild
# the train/test/valid matrices and label vectors accordingly.
features = cols_all
Xtr, Xtx, Xvd = (df_Xtrain_orig[features],
                 df_Xtest[features],
                 df_Xvalid[features])
ytr = ser_ytrain_orig.to_numpy().ravel()
ytx = ser_ytest.to_numpy().ravel()
yvd = ser_yvalid.to_numpy().ravel()
# --- Evaluate the tuned LightGBM model saved by the random-search notebook ---
model_grid_random = joblib.load('../outputs/lgb_randomsearch_best_model.pkl')
model_grid_random
# time
time_start = time.time()
model_name = 'lightgbm'
desc = 'grid_randomsearch'
# model
clf_lgb = model_grid_random
# fit and save the model
# Refit the tuned estimator on the full original training split.
clf_lgb.fit(Xtr, ytr,eval_metric='multi_error')
# predictions
# NOTE(review): cross_val_predict re-fits clones of the estimator on folds of
# the TEST set, so these are out-of-fold test predictions — not predictions
# from the model fitted above.  Confirm this evaluation protocol is intended.
skf = StratifiedKFold(n_splits=10,shuffle=True,random_state=SEED)
ypreds_cv = cross_val_predict(clf_lgb, Xtx, ytx, cv=skf)
ypreds = ypreds_cv
# auc
# roc_auc_score needs one-hot (binarized) labels in the multiclass case.
# (ytest and ytx are both derived from ser_ytest, so either works here.)
labels = [0, 1, 2, 3]
ytest_bin = label_binarize(ytest, classes=labels)
ypreds_bin = label_binarize(ypreds, classes=labels)
auc = roc_auc_score(ytest_bin,ypreds_bin,
average='macro',multi_class='ovo')
# model evaluation
average = 'macro'
row_eval = [model_name,desc,
accuracy_score(ytx, ypreds),
precision_score(ytx, ypreds, average=average),
recall_score(ytx, ypreds, average=average),
f1_score(ytx, ypreds, average=average),
auc
]
# Append this run to the results table; drop exact-duplicate rows so
# re-running the cell does not double-count the same configuration.
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
display(df_eval)
print_scores(ytest,ypreds)
df.head(2)
df.shape

# Plot the fitted model's feature importances, largest first, annotating
# each bar with its value.  The index is the full feature-column list
# (df's columns minus the target), matching the cols_all training features.
arr_fimp = clf_lgb.feature_importances_
df_fimp = pd.DataFrame(arr_fimp, columns=['Importances'],
                       index=df.columns.drop(target))
df_fimp = df_fimp.sort_values('Importances', ascending=False)

plt.figure(figsize=(12, 12))
ax = sns.barplot(x=df_fimp.Importances, y=df_fimp.index);
for patch in ax.patches:
    width = patch.get_width()
    ax.text(width, patch.get_y(), '{:.2f}'.format(width),
            fontsize=15, color='indigo', va='top', ha='left')
# --- Global model interpretation with eli5 ---
import eli5
# Feature weights of the fitted LightGBM model (rendered as HTML in Jupyter).
eli5.show_weights(clf_lgb)
from eli5.sklearn import PermutationImportance
feature_names = df_Xtrain.columns.tolist()
# Permutation importance on the held-out test set: shuffles one column at a
# time and measures the score drop.  NOTE(review): assumes df_Xtest's column
# order matches the features the model was trained on — verify.
perm = PermutationImportance(clf_lgb).fit(df_Xtest, ytx)
eli5.show_weights(perm, feature_names=feature_names)
df.head(2)

# Inspect one held-out example: true label versus the model's prediction.
idx = 0
example = df_Xtest.iloc[idx]
answer = ser_ytest.iloc[idx]
feature_names = df_Xtest.columns.tolist()
# The estimator expects a 2-D (1, n_features) array for a single row.
prediction = clf_lgb.predict(example.to_numpy().reshape(1, -1))
print(f'answer = {answer}')
print('prediction = ', prediction[0])
print()
print(example)
print(feature_names)
df.head(2)
# --- Local explanation of a single prediction with LIME ---
import lime
import lime.lime_tabular
# Columns LIME should treat as categorical (indicator/one-hot/binned features).
categorical_features = ['Gender', 'Ever_Married', 'Graduated',
'Ever_Married_NA', 'Graduated_NA', 'Profession_NA',
'Work_Experience_NA', 'Family_Size_NA', 'Var_1_NA',
'Age_cat', 'Family_Size_cat', 'Work_Experience_cat',
'Profession_Artist', 'Profession_Doctor',
'Profession_Engineer', 'Profession_Entertainment',
'Profession_Executive', 'Profession_Healthcare',
'Profession_Homemaker', 'Profession_Lawyer',
'Profession_Marketing',
'Var_1_Cat_1', 'Var_1_Cat_2', 'Var_1_Cat_3',
'Var_1_Cat_4', 'Var_1_Cat_5', 'Var_1_Cat_6',
'Var_1_Cat_7'
]
# LimeTabularExplainer takes positional column indices, not names.
categorical_features_idx = [df_Xtrain.columns.get_loc(col)
for col in categorical_features]
feature_names = df_Xtrain.columns
NUM_FEATURES = len(feature_names)
explainer = lime.lime_tabular.LimeTabularExplainer(df_Xtrain.to_numpy(),
feature_names=feature_names,
categorical_features = categorical_features_idx,
class_names=list('ABCD'),
mode='classification')
# Explain the test row selected earlier ('example'), using the model's
# predict_proba for the local surrogate fit.
exp = explainer.explain_instance(example, clf_lgb.predict_proba,
num_features=NUM_FEATURES)
exp.show_in_notebook(show_table=True)
# NOTE(review): despite the 'ax' name, as_pyplot_figure appears to return a
# matplotlib Figure — set_figheight is a Figure method.
ax = exp.as_pyplot_figure(); # use semicolon
ax.set_figheight(12);
# --- SHAP explanations for the fitted LightGBM model ---
import shap
shap.initjs()
%%time
explainer = shap.TreeExplainer(clf_lgb)
# NOTE(review): the [class][row, :] indexing below implies shap_values is a
# per-class list of (n_samples, n_features) arrays, stacked here into a
# (n_classes, n_samples, n_features) array — confirm for this shap version.
shap_values = np.array(explainer.shap_values(df_Xtest))
df_Xtest.shape, explainer.expected_value, type(explainer.expected_value), len(explainer.expected_value)
# Force plot for a single test row, explaining class index 1.
idx = 5
shap.force_plot(explainer.expected_value[1],
shap_values[1][idx,:],
df_Xtest.iloc[idx,:] # this is just for giving feature names
)
# many points
NUM = 1000
shap.force_plot(explainer.expected_value[1],
shap_values[1][:NUM,:],
df_Xtest.iloc[:NUM,:] # this is just for giving feature names
)
shap_values.shape, df_Xtest.shape
# Summary plot of per-feature SHAP values for class index 0.
shap.summary_plot(shap_values[0], df_Xtest)